In [32]:
# Regression. Partial F-tests and Lack-of-Fit Tests
# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import os
# Load data
# The data folder is machine-specific; allow overriding it through the
# AUTO_DATA_DIR environment variable instead of hardcoding one user's
# absolute path (original path kept as the default for compatibility).
DATA_DIR = os.environ.get(
    "AUTO_DATA_DIR",
    "C:\\Users\\baron\\Documents\\Teach\\627 Statistical Machine Learning\\Data",
)
os.chdir(DATA_DIR)  # Change the working directory
# The Auto data marks missing horsepower values with '?' (presumably why a
# later cell coerces horsepower to numeric — confirm against the raw file);
# na_values handles this at read time. Backward compatible: to_numeric with
# errors='coerce' on an already-numeric column is a no-op.
auto = pd.read_csv("Auto.csv", na_values="?")  # Read the data file in the CSV format
In [56]:
# 1. PARTIAL F-TEST
# Full model: mpg regressed on all four candidate predictors.
# Auto.csv stores horsepower as text (it contains non-numeric entries), so
# convert it to numeric first; unparseable values become NaN and are then
# dropped by the OLS fit.
auto["horsepower"] = pd.to_numeric(auto["horsepower"], errors="coerce")
full_formula = "mpg ~ year + acceleration + horsepower + weight"
reg_full = smf.ols(full_formula, data=auto).fit()
print(reg_full.summary())
OLS Regression Results ============================================================================== Dep. Variable: mpg R-squared: 0.809 Model: OLS Adj. R-squared: 0.807 Method: Least Squares F-statistic: 408.8 Date: Thu, 01 Aug 2024 Prob (F-statistic): 1.72e-137 Time: 18:23:43 Log-Likelihood: -1037.1 No. Observations: 392 AIC: 2084. Df Residuals: 387 BIC: 2104. Df Model: 4 Covariance Type: nonrobust ================================================================================ coef std err t P>|t| [0.025 0.975] -------------------------------------------------------------------------------- Intercept -15.3889 4.671 -3.294 0.001 -24.574 -6.204 year 0.7511 0.052 14.381 0.000 0.648 0.854 acceleration 0.0802 0.100 0.803 0.422 -0.116 0.277 horsepower 0.0026 0.013 0.196 0.845 -0.024 0.029 weight -0.0066 0.000 -14.099 0.000 -0.008 -0.006 ============================================================================== Omnibus: 38.374 Durbin-Watson: 1.223 Prob(Omnibus): 0.000 Jarque-Bera (JB): 60.816 Skew: 0.639 Prob(JB): 6.22e-14 Kurtosis: 4.446 Cond. No. 8.35e+04 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 8.35e+04. This might indicate that there are strong multicollinearity or other numerical problems.
In [60]:
# Reduced model: drop 'year' and 'acceleration', keeping only horsepower and weight.
reduced_formula = "mpg ~ horsepower + weight"
reg_reduced = smf.ols(reduced_formula, data=auto).fit()
print(reg_reduced.summary())
OLS Regression Results ============================================================================== Dep. Variable: mpg R-squared: 0.706 Model: OLS Adj. R-squared: 0.705 Method: Least Squares F-statistic: 467.9 Date: Thu, 01 Aug 2024 Prob (F-statistic): 3.06e-104 Time: 18:24:16 Log-Likelihood: -1121.0 No. Observations: 392 AIC: 2248. Df Residuals: 389 BIC: 2260. Df Model: 2 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 45.6402 0.793 57.540 0.000 44.081 47.200 horsepower -0.0473 0.011 -4.267 0.000 -0.069 -0.026 weight -0.0058 0.001 -11.535 0.000 -0.007 -0.005 ============================================================================== Omnibus: 35.336 Durbin-Watson: 0.858 Prob(Omnibus): 0.000 Jarque-Bera (JB): 45.973 Skew: 0.683 Prob(JB): 1.04e-10 Kurtosis: 3.974 Cond. No. 1.15e+04 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 1.15e+04. This might indicate that there are strong multicollinearity or other numerical problems.
In [70]:
# ANOVA.
# Partial F-test: does adding 'year' and 'acceleration' to the reduced model
# significantly improve the fit? anova_lm compares the two nested models.
partial_f_table = sm.stats.anova_lm(reg_reduced, reg_full)
print(partial_f_table)
df_resid ssr df_diff ss_diff F Pr(>F) 0 389.0 6993.845437 0.0 NaN NaN NaN 1 387.0 4558.049951 2.0 2435.795486 103.405279 1.050026e-36
In [ ]:
# The p-value comparing these two models is highly significant, so the two variables make a significant
# contribution to the prediction of mpg, in addition to weight and horsepower.
In [74]:
# 2. LACK-OF-FIT TEST
# Test linearity by comparing the linear model (reduced) against a model with
# one dummy variable per distinct value of X (the full model, which makes no
# linearity assumption).
# Reduced model: mpg as a linear function of cylinders.
reg_reduced = smf.ols("mpg ~ cylinders", data=auto).fit()
# Full model: treat cylinders as an unordered categorical factor.
auto["cyl_categorical"] = auto["cylinders"].astype("category")
reg_full = smf.ols("mpg ~ C(cyl_categorical)", data=auto).fit()
# ANOVA comparing the two nested models.
anova_results = sm.stats.anova_lm(reg_reduced, reg_full)
print(anova_results)
df_resid ssr df_diff ss_diff F Pr(>F) 0 395.0 9638.365011 0.0 NaN NaN NaN 1 392.0 8758.095487 3.0 880.269523 13.133207 3.450919e-08
In [76]:
# Low p-value shows that the relation of mpg to the number of cylinders is non-linear.
# Continuous case: when the X-variable has (almost) no repeated values, bin it
# first. Round horsepower to the nearest 10 and treat each bin as a category.
auto["hp_rounded"] = auto["horsepower"].round(-1).astype("category")
# Reduced model: plain linear regression of mpg on horsepower.
reg_reduced = smf.ols("mpg ~ horsepower", data=auto).fit()
# Full model: one dummy per rounded-horsepower bin (no linearity assumed).
reg_full = smf.ols("mpg ~ C(hp_rounded)", data=auto).fit()
# ANOVA comparing the nested models.
anova_results = sm.stats.anova_lm(reg_reduced, reg_full)
print(anova_results)
df_resid ssr df_diff ss_diff F Pr(>F) 0 390.0 9385.915872 0.0 NaN NaN NaN 1 373.0 7101.888522 17.0 2284.02735 7.056468 6.662265e-15
In [ ]:
# The full model is significantly better. So, mpg is a non-linear function of horsepower.